Library Call¶

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as ply

Data Preparation¶

In [2]:
nf=pd.read_csv(r"C:\Users\yuvraj\Downloads\netflix.csv",encoding='Latin')
In [3]:
nf.head()
Out[3]:
show_id type title director cast country date_added release_year rating duration ... Unnamed: 16 Unnamed: 17 Unnamed: 18 Unnamed: 19 Unnamed: 20 Unnamed: 21 Unnamed: 22 Unnamed: 23 Unnamed: 24 Unnamed: 25
0 s1 Movie Dick Johnson Is Dead Kirsten Johnson NaN United States September 25, 2021 2020 PG-13 90 min ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 s2 TV Show Blood & Water NaN Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban... South Africa September 24, 2021 2021 TV-MA 2 Seasons ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 s3 TV Show Ganglands Julien Leclercq Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi... NaN September 24, 2021 2021 TV-MA 1 Season ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 s4 TV Show Jailbirds New Orleans NaN NaN NaN September 24, 2021 2021 TV-MA 1 Season ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
4 s5 TV Show Kota Factory NaN Mayur More, Jitendra Kumar, Ranjan Raj, Alam K... India September 24, 2021 2021 TV-MA 2 Seasons ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

5 rows × 26 columns

In [4]:
nf.columns
Out[4]:
Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description',
       'Unnamed: 12', 'Unnamed: 13', 'Unnamed: 14', 'Unnamed: 15',
       'Unnamed: 16', 'Unnamed: 17', 'Unnamed: 18', 'Unnamed: 19',
       'Unnamed: 20', 'Unnamed: 21', 'Unnamed: 22', 'Unnamed: 23',
       'Unnamed: 24', 'Unnamed: 25'],
      dtype='object')

Resize the Dataset¶

In [5]:
nf=nf.iloc[:,:12]
In [6]:
nf.head()
Out[6]:
show_id type title director cast country date_added release_year rating duration listed_in description
0 s1 Movie Dick Johnson Is Dead Kirsten Johnson NaN United States September 25, 2021 2020 PG-13 90 min Documentaries As her father nears the end of his life, filmm...
1 s2 TV Show Blood & Water NaN Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban... South Africa September 24, 2021 2021 TV-MA 2 Seasons International TV Shows, TV Dramas, TV Mysteries After crossing paths at a party, a Cape Town t...
2 s3 TV Show Ganglands Julien Leclercq Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi... NaN September 24, 2021 2021 TV-MA 1 Season Crime TV Shows, International TV Shows, TV Act... To protect his family from a powerful drug lor...
3 s4 TV Show Jailbirds New Orleans NaN NaN NaN September 24, 2021 2021 TV-MA 1 Season Docuseries, Reality TV Feuds, flirtations and toilet talk go down amo...
4 s5 TV Show Kota Factory NaN Mayur More, Jitendra Kumar, Ranjan Raj, Alam K... India September 24, 2021 2021 TV-MA 2 Seasons International TV Shows, Romantic TV Shows, TV ... In a city of coaching centers known to train I...
In [52]:
# Basic Information
In [8]:
nf.shape
Out[8]:
(8809, 12)
In [9]:
nf.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8809 entries, 0 to 8808
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8809 non-null   object
 1   type          8809 non-null   object
 2   title         8809 non-null   object
 3   director      6175 non-null   object
 4   cast          7984 non-null   object
 5   country       7978 non-null   object
 6   date_added    8799 non-null   object
 7   release_year  8809 non-null   int64 
 8   rating        8805 non-null   object
 9   duration      8806 non-null   object
 10  listed_in     8809 non-null   object
 11  description   8809 non-null   object
dtypes: int64(1), object(11)
memory usage: 826.0+ KB
In [10]:
nf.isna().sum()
Out[10]:
show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64
In [11]:
#Filling NaN Values

nf.director= nf.director.fillna("No Name")
nf.cast    = nf.cast.fillna("No Cast")
nf.country = nf.country.fillna("No Country")
nf.date_added= nf.date_added.fillna("No Date")
nf.rating  = nf.rating.fillna("No Rating")
nf.duration= nf.duration.fillna("No Duration")
In [12]:
nf.isna().sum()
Out[12]:
show_id         0
type            0
title           0
director        0
cast            0
country         0
date_added      0
release_year    0
rating          0
duration        0
listed_in       0
description     0
dtype: int64
In [13]:
nf.head()
Out[13]:
show_id type title director cast country date_added release_year rating duration listed_in description
0 s1 Movie Dick Johnson Is Dead Kirsten Johnson No Cast United States September 25, 2021 2020 PG-13 90 min Documentaries As her father nears the end of his life, filmm...
1 s2 TV Show Blood & Water No Name Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban... South Africa September 24, 2021 2021 TV-MA 2 Seasons International TV Shows, TV Dramas, TV Mysteries After crossing paths at a party, a Cape Town t...
2 s3 TV Show Ganglands Julien Leclercq Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi... No Country September 24, 2021 2021 TV-MA 1 Season Crime TV Shows, International TV Shows, TV Act... To protect his family from a powerful drug lor...
3 s4 TV Show Jailbirds New Orleans No Name No Cast No Country September 24, 2021 2021 TV-MA 1 Season Docuseries, Reality TV Feuds, flirtations and toilet talk go down amo...
4 s5 TV Show Kota Factory No Name Mayur More, Jitendra Kumar, Ranjan Raj, Alam K... India September 24, 2021 2021 TV-MA 2 Seasons International TV Shows, Romantic TV Shows, TV ... In a city of coaching centers known to train I...

Groupby(Type)¶

In [14]:
movie_tv=nf.groupby(['type']).size().reset_index(name='count')
movie_tv
Out[14]:
type count
0 Movie 6132
1 TV Show 2677
In [15]:
ply.pie(movie_tv,names='type',values='count')

Content Analysis¶

In [16]:
content=nf.groupby(by=['rating']).size().reset_index(name='count').nlargest(10,'count')
content.head()
Out[16]:
rating count
13 TV-MA 3208
11 TV-14 2160
14 TV-PG 863
10 R 799
9 PG-13 490
In [17]:
ply.bar(content,x='rating',y='count',text_auto=True)

Top Director¶

In [18]:
#stack diffrent name that belong to the same cell

director = nf['director'].str.split(',',expand=True).stack()
director = pd.DataFrame(director,columns=['Director'])
director = nf.groupby(by=['director']).size().reset_index(name='count').nlargest(15,'count')
director
Out[18]:
director count
3051 No Name 2634
3393 Rajiv Chilaka 19
3444 Raúl Campos, Jan Suter 18
2598 Marcus Raboy 16
4047 Suhas Kadav 16
1790 Jay Karas 14
685 Cathy Garcia-Molina 13
1787 Jay Chapman 12
2671 Martin Scorsese 12
4482 Youssef Chahine 12
4021 Steven Spielberg 11
1105 Don Michael Paul 10
973 David Dhawan 9
1282 Fernando Ayllón 8
1507 Hakan Algül 8
In [19]:
director=director[director['director'] !='No Name']
director
Out[19]:
director count
3393 Rajiv Chilaka 19
3444 Raúl Campos, Jan Suter 18
2598 Marcus Raboy 16
4047 Suhas Kadav 16
1790 Jay Karas 14
685 Cathy Garcia-Molina 13
1787 Jay Chapman 12
2671 Martin Scorsese 12
4482 Youssef Chahine 12
4021 Steven Spielberg 11
1105 Don Michael Paul 10
973 David Dhawan 9
1282 Fernando Ayllón 8
1507 Hakan Algül 8
In [20]:
ply.bar(director,x='director',y='count',text_auto=True)

Top Cast¶

In [21]:
#stack individual
casting_individual=nf['cast'].str.split(",",expand=True).stack().reset_index(drop=True)
casting_individual
Out[21]:
0                           No Cast
1                        Ama Qamata
2                       Khosi Ngema
3                     Gail Mabalane
4                    Thabang Molaba
                    ...            
64985             Ji?í Maria Sieber
64986                Raymond Waring
64987                   Petr Drozda
64988                    John Comer
64989     Benedetta Degli Innocenti
Length: 64990, dtype: object
In [23]:
cast_individual=pd.DataFrame(casting_individual,columns=['TotalCast'])
cast_individual.head()
Out[23]:
TotalCast
0 No Cast
1 Ama Qamata
2 Khosi Ngema
3 Gail Mabalane
4 Thabang Molaba
In [26]:
top_15_cast= cast_individual.groupby(by=['TotalCast']).size().reset_index(name='count')
top_15_cast= top_15_cast.sort_values(by=['count'],ascending=False)
top_15_cast.head()
Out[26]:
TotalCast count
37675 No Cast 825
2612 Anupam Kher 39
26965 Rupa Bhimani 31
30327 Takahiro Sakurai 30
15555 Julie Tejwani 28
In [27]:
top_15_cast= top_15_cast.iloc[1:,:]
In [28]:
top_15_cast= top_15_cast.nlargest(15,'count')
In [29]:
top_15_cast.head()
Out[29]:
TotalCast count
2612 Anupam Kher 39
26965 Rupa Bhimani 31
30327 Takahiro Sakurai 30
15555 Julie Tejwani 28
23642 Om Puri 27
In [32]:
ply.bar(top_15_cast,x='TotalCast',y='count',text_auto=True)

Listed In¶

In [34]:
#Stack diffrent name that belong to same cell
new_listed_in = nf['listed_in'].str.split(",",expand=True).stack().reset_index(drop=True)
new_listed_in = pd.DataFrame(new_listed_in,columns=['Listed'])
new_listed_in.head()
Out[34]:
Listed
0 Documentaries
1 International TV Shows
2 TV Dramas
3 TV Mysteries
4 Crime TV Shows
In [37]:
top_15_item = new_listed_in.groupby(by=['Listed']).size().reset_index(name='count')
top_15_item = top_15_item.nlargest(15,'count')
top_15_item
Out[37]:
Listed count
15 International Movies 2624
54 Dramas 1600
48 Comedies 1210
41 Action & Adventure 859
51 Documentaries 829
10 Dramas 827
58 International TV Shows 774
14 Independent Movies 736
33 TV Dramas 696
23 Romantic Movies 613
45 Children & Family Movies 605
16 International TV Shows 577
40 Thrillers 512
5 Comedies 464
32 TV Comedies 461
In [38]:
ply.bar(top_15_item,x='Listed',y='count',text_auto=True)

Type vs Country¶

In [40]:
content=nf.groupby(by=["type",'country']).size().reset_index(name='count').nlargest(16,'count')
content.head()
Out[40]:
type country count
526 Movie United States 2059
218 Movie India 893
813 TV Show United States 760
321 Movie No Country 440
754 TV Show No Country 391
In [42]:
content=content[content['country'] != 'No Country']
content
Out[42]:
type country count
526 Movie United States 2059
218 Movie India 893
813 TV Show United States 760
793 TV Show United Kingdom 213
441 Movie United Kingdom 206
735 TV Show Japan 169
773 TV Show South Korea 159
50 Movie Canada 122
385 Movie Spain 97
128 Movie Egypt 92
319 Movie Nigeria 86
718 TV Show India 79
238 Movie Indonesia 77
278 Movie Japan 76
In [51]:
plt.figure(figsize=(11,5))
sns.pointplot(x='country',y='count',hue='type',data=content)
plt.show()

Type and Release Year¶

In [54]:
type_and_year = nf.groupby(by=['type','release_year']).size().reset_index(name='count')
type_and_year = type_and_year[type_and_year['release_year']>2007]
type_and_year.head()
Out[54]:
type release_year count
59 Movie 2008 113
60 Movie 2009 118
61 Movie 2010 154
62 Movie 2011 145
63 Movie 2012 173
In [56]:
type_and_year.rename(columns={'type':'Type','release_year':'Release_Year'},inplace=True)
In [57]:
type_and_year.head()
Out[57]:
Type Release_Year count
59 Movie 2008 113
60 Movie 2009 118
61 Movie 2010 154
62 Movie 2011 145
63 Movie 2012 173
In [59]:
ply.line(type_and_year,x="Release_Year",y="count",color='Type')
In [67]:
sns.histplot(x="type",data=nf,edgecolor="black",bins=8,color="c")
plt.grid()
In [ ]:
 
In [ ]:
 
In [ ]: